#!/usr/bin/env python2.3 # # Validation Test.py # # This program takes in an expression data file # and a BPM network and outputs the result of # the validation test described in "Evaluating # Between-Pathway Models with Expression Data", # published in RECOMB 2009 # # Created by Max Leiserson on 7/30/08. # import time import sys import string import random import os def readBPMs (response): FILE = open (response, 'r') line = 'not null' partition1 = [] partition2 = [] i = 0 while line != '': line = FILE.readline() line = line.rstrip() array = line.split ("\t") if ((i %2) == 0): partition1.append (array) else: partition2.append (array) i = i + 1 total = [] total.append (partition1) total.append (partition2) response = response.strip('BPM Networks/') response = response.rstrip('.tx') if response.count('KI ') > 0: currentTime = 'Output/KI Pruned' + '/' + time.strftime("%d%b%Y", time.localtime()) +' ' + time.strftime("%H_%M_%S", time.localtime()) os.mkdir (currentTime) os.chdir (currentTime) else: currentTime = 'Output/' + response + '/' + time.strftime("%d%b%Y", time.localtime()) +' ' + time.strftime("%H_%M_%S", time.localtime()) os.mkdir (currentTime) os.chdir (currentTime) print 'Dataset:', response return total # retrieve all the KO data from the hughes dataset def getKnockouts(expression_data): #open file geneList = [] knockoutInfo =[] prunedList = [] #all the genes in the hughes file #fix later - read until file ends for i in range(0, 4): geneList.append(expression_data.readline()) #print geneList[2] tempGenes = geneList[2].split('\t') del tempGenes[0:4] hughGeneList=[] indices = [] #put gene column titles *top row* stored at 0 for i in range(0, len(tempGenes)): if(i%4 == 0): genes = [] if (tempGenes[i].count(' (') > 0): arr = tempGenes[i].split(' (') gene = arr[0].upper() else: gene = tempGenes[i].upper() if (gene.count(',')): arr2 = gene.split(',') if (arr2[0].count('"') > 0): arr3 = arr2[0].split('"') genes.append(arr3[1].upper()) else: genes.append(arr2[0]) if (arr2[1].count('"') > 0): arr4 = arr2[1].split('"') genes.append(arr4[0].upper()) else: genes.append(arr2[1].upper()) else: genes.append(gene) hughGeneList.append(genes) prunedList.append(genes[0]) indices.append(i/4) if (len(genes) > 1): prunedList.append(genes[1]) indices.append(i) # put a list of all the genes tested in hughes in index 0 knockoutInfo.append(prunedList) next = geneList[3].split('\t') del next[0:2] stats = next[0:4] #put stat titles in knockout info at index 1 knockoutInfo.append(stats) #put knockout gene *first column* in knockout info in index 2 knockouts = [] line = expression_data.readline() while line != '': data = line.split('\t') knockouts.append(data[0].upper()) line = expression_data.readline() knockoutInfo.append(knockouts) knockoutInfo.append(hughGeneList) knockoutInfo.append(indices) return knockoutInfo def readRegistryGenenames(): #opens the file of genenames and common names, and creates a dictionary where one can find the common name by entering the genename openFile = open ('../../../registry.genenames.tab', 'r') lines = [] genenames = {} for line in openFile: lines.append (line) for item in lines: anArray = item.rsplit ('\t') genenames[anArray[-2]] = anArray[0] # in the file, the first item is the common name and the second to last item is the genename return genenames def ourGeneList(bpmList): ourList = [] ourListY = [] ourListCommon = [] for i in range(0, len(bpmList[0])): for j in range(0, len(bpmList[0][i])): if(ourListY.count(bpmList[0][i][j]) == 0): ourListY.append(bpmList[0][i][j]) for i in range(0, len(bpmList[1])): for j in range(0, len(bpmList[1][i])): if(ourListY.count(bpmList[1][i][j]) == 0): ourListY.append(bpmList[1][i][j]) for i in range(0, len(bpmList[2])): for j in range(0, len(bpmList[2][i])): if(ourListCommon.count(bpmList[2][i][j]) == 0): ourListCommon.append(bpmList[2][i][j]) for i in range(0, len(bpmList[3])): for j in range(0, len(bpmList[3][i])): if(ourListCommon.count(bpmList[3][i][j]) == 0): ourListCommon.append(bpmList[3][i][j]) ourList.append(ourListY) ourList.append(ourListCommon) return ourList def theirListLocations(ourList, results): theirList = [] theirListY = [] theirListCommon = [] for i in range(0, len(ourList[0])): if(results.count(ourList[0][i]) != 0): location = results.index(ourList[0][i]) theirListY.append(location) else: theirListY.append(-1) for i in range(0, len(ourList[1])): if(results.count(ourList[1][i]) != 0): location = results.index(ourList[1][i]) theirListCommon.append(location) else: theirListCommon.append(-1) theirList.append(theirListY) theirList.append(theirListCommon) return theirList def getHughesColumn(expression_data, offset): expression_data.seek(0,0) returnData = [] for k in range (0, 4): junk = expression_data.readline() line = expression_data.readline() while line != '': temp = [] data = line.split('\t') temp.append(data[0]) temp.append(data[offset*4+2]) temp.append(data[offset*4+3]) temp.append(data[offset*4+4]) temp.append(data[offset*4+5]) returnData.append(temp) line = expression_data.readline() return returnData def getSortedMicroArray(inputMicroArray): micro = [] for j in range(0, len(inputMicroArray)): temp = [] if inputMicroArray[j][2] != ' NaN': temp.append(float(inputMicroArray[j][2])) #convert the third column to a float, take its absolute value, and append it to temp temp.append(inputMicroArray[j][0]) micro.append(temp) micro.sort() #sort the array micro.reverse() #reverse it, so it is in the correct order return micro #comments # getSortedArray does the same thing as getSortedMicroArray, except it doesn't include the names in the array def getSortedArray (inputMicroArray): micro = [] for j in range(0, len(inputMicroArray)): if inputMicroArray[j][2] != ' NaN': micro.append(float(inputMicroArray[j][2])) micro.sort() micro.reverse() return micro def getSortedMicroArrayAbs(inputMicroArray): micro = [] for j in range(0, len(inputMicroArray)): temp = [] if inputMicroArray[j][2] != ' NaN': temp.append(abs(float(inputMicroArray[j][2]))) #convert the third column to a float, take its absolute value, and append it to temp temp.append(inputMicroArray[j][0]) micro.append(temp) micro.sort() #sort the array micro.reverse() #reverse it, so it is in the correct order return micro #comments # getSortedArray does the same thing as getSortedMicroArray, except it doesn't include the names in the array def getSortedArrayAbs (inputMicroArray): micro = [] for j in range(0, len(inputMicroArray)): if inputMicroArray[j][2] != ' NaN': micro.append(abs(float(inputMicroArray[j][2]))) micro.sort() micro.reverse() return micro #comments #inputArray is a sorted micro array #geneset is the overall gene set to test #index is the index up to which N_r will be calculated def calculateN_r(inputArray, geneList): N_r = 0 for i in range(0,len(inputArray)): for j in range(0, len(geneList)): #print '---' #print 'inputArray ', inputArray[i][1] #print '--' #print 'geneList ', geneList[j] if inputArray[i][1] == geneList[j]: N_r += abs(inputArray[i][0]) if N_r == 0: print 'ERROR with N_r count' return -1 else: return float(N_r) #comments # add up the absolute values of the log10 values for an inputArray def calculateN_r2 (inputArray): total = 0.0 for i in range (len (inputArray)): total += abs(inputArray[i][0]) return total def generanks (sorted, genelist): array = [] n = float(len(sorted)) sortedNames = [] sorted.reverse() for i in range (len (sorted)): sortedNames.append (sorted[i][1]) for item in genelist: temp = [] temp.append (((((n/2) - float(sortedNames.index (item))) / (n/2)))) temp.append (float(sortedNames.index (item))) temp.append(item) array.append (temp) array.sort() array.reverse() return array def generanksAbs (sorted, genelist): array = [] sorted.reverse() sortedNames = [] newSorted = [] numberBefore = 0 numberDistinct = 0 currentDupes = 0 currentValue = sorted[0][0] for z in range(len(sorted)): temp = [] if(sorted[z][0] != currentValue): numberBefore +=currentDupes currentDupes = 0 numberDistinct +=1 temp.append(numberBefore) temp.append(sorted[z][0]) temp.append(sorted[z][1]) newSorted.append(temp) numberBefore +=1 else: currentDupes +=1 temp.append(numberBefore) temp.append(sorted[z][0]) temp.append(sorted[z][1]) newSorted.append(temp) currentValue = sorted[z][0] finalSort = [] rankWeight = len(sorted) currentDupes = 0 currentValue = sorted[0][0] for i in range(len(sorted)): temp = [] if(sorted[i][0] != currentValue): rankWeight -= currentDupes currentDupes = 0 temp.append(rankWeight) temp.append(newSorted[i][0]) temp.append(newSorted[i][1]) temp.append(newSorted[i][2]) finalSort.append(temp) rankWeight -=1 else: currentDupes += 1 temp.append(rankWeight) temp.append(newSorted[i][0]) temp.append(newSorted[i][1]) temp.append(newSorted[i][2]) finalSort.append(temp) currentValue = sorted[i][0] returnSet = [] for k in range (len(finalSort)): if(counter(genelist, finalSort[k][3]) != 0): returnSet.append(finalSort[k]) return returnSet def counter(list, item): count = 0 for i in range(0, len(list)): if (list[i] == item): count +=1 return count def gsea(inputMicroArray, geneList, gseaType): #sort inputMicroArray by log10 score sorted = getSortedMicroArray(inputMicroArray) #switch it to smallest to biggest #----optimize later sorted.reverse() geneRanks = generanksAbs(sorted, geneList) maxDiff = 0 answer = [] tempAnswer = 0 runningHit = 0 temp = 0 nr = calculateN_r2(geneRanks, gseaType) N = float(len(inputMicroArray)) Nh = float(len(geneRanks)) for i in range(0, len(geneRanks)): #test right before we see a new element - #misses have reached their max if (N == 0): N = 1 if (nr == 0): nr = 1 try: temp = (runningHit/nr) - ((geneRanks[i][1])/N) except ZeroDivisionError: temp = 0 if (abs(temp) > maxDiff): maxDiff = abs(temp) tempAnswer = temp #add the element we are on runningHit += abs(geneRanks[i][2]) temp = (runningHit/nr) - ((geneRanks[i][1])/N) if (abs(temp) > maxDiff): maxDiff = abs(temp) tempAnswer = temp answer.append(tempAnswer) return answer def gsea2(inputMicroArray, geneList): #sort inputMicroArray by log10 score sorted = getSortedMicroArray(inputMicroArray) #switch it to smallest to biggest #----optimize later sorted.reverse() geneRanks = generanks(sorted, geneList) maxDiff = 0 answer = [] tempAnswer = 0 runningHit = 0 nr = calculateN_r2(geneRanks, '2') N = float(len(inputMicroArray)) Nh = float(len(geneRanks)) for i in range(0, len(geneRanks)): #test right before we see a new element - #misses have reached their max temp = (runningHit/nr) - ((geneRanks[i][1] - i -1)/N) if (abs(temp) > maxDiff): maxDiff = abs(temp) tempAnswer = temp #add the element we are on runningHit += abs(geneRanks[i][0]) temp = (runningHit/nr) - ((geneRanks[i][1] - i)/N) if (abs(temp) > maxDiff): maxDiff = abs(temp) tempAnswer = temp answer.append(tempAnswer) return answer def compute_rank_cluster_score(inputMicroArray, geneList): #sort inputMicroArray by log10 score sorted = getSortedMicroArrayAbs(inputMicroArray) #switch it to smallest to biggest #----optimize later sorted.reverse() geneRanks = generanksAbs(sorted, geneList) maxDiff = 0 answer = [] tempAnswer = 0 runningHit = 0 temp = 0 nr = calculateN_r2(geneRanks) N = float(len(sorted)) # sorted does not include the NaN values, which should be ignored when calculating misses Nh = float(len(geneRanks)) for i in range(0, len(geneRanks)): #test right before we see a new element - #misses have reached their max temp = (runningHit/nr) - ((geneRanks[i][1])/N) if (abs(temp) > maxDiff): maxDiff = abs(temp) tempAnswer = temp #add the element we are on runningHit += abs(geneRanks[i][0]) temp = (runningHit/nr) - ((geneRanks[i][1])/N) if (abs(temp) > maxDiff): maxDiff = abs(temp) tempAnswer = temp answer.append(tempAnswer) return answer def get_random_geneset_data(inputMicroArray, geneList): random.seed() pRaw = [] #because we want no NaN reuse sort - but don't reuse it can't be processed twice #in v2.0 write a clean function sorted = getSortedMicroArrayAbs (inputMicroArray) for i in range(0, 99): # get a random set of genes randomSet = [] while len(randomSet) < len(geneList): candidate = random.randint(0, len(sorted)-1) count = 0 for j in range(0, len(geneList)): if (sorted[candidate][1] == geneList[j]): count +=1 if count == 0: #print 'adding ', sorted[candidate][1], ' to list' randomSet.append(sorted[candidate][1]) pRaw.append (compute_rank_cluster_score(inputMicroArray, randomSet)) return pRaw def get_random_knockout_data(expression_data, knockOutPathway, knockOutPathwayY, EsPathway, EsPathwayY, knockOutList): #get random set of test - none of which are in knockout pathway #calculate the opposite pathway's es value random_CRS_scores = [] for i in range(0, 99): found = 0 rIndex = 0 random.seed() while found == 0: count = 0 rIndex = random.randint(0,len(knockOutList)-1) for a in range(len(knockOutList[rIndex])): count += counter(knockOutPathway, knockOutList[rIndex][a]) count += counter(knockOutPathwayY, knockOutList[rIndex][a]) count += counter(EsPathway, knockOutList[rIndex][a]) count += counter(EsPathwayY, knockOutList[rIndex][a]) if count == 0: found = 1 break #print 'attempting index is', rIndex random_hughes_column = getHughesColumn(expression_data, rIndex) random_CRS_scores.append (compute_rank_cluster_score(random_hughes_column, EsPathwayY)) return random_CRS_scores def calculate_p_value(CRS, random_CRS_scores): # once all the random values have been appended to random_CRS_scores, # sort random_CRS_scores so as to make finding the CRS's rank in random_CRS_scores possible random_CRS_scores.sort() p = 0.0 # for each item in random_CRS_scores, if p is greater than the CRS, incrememnt p by 1 for a in range (0, len(random_CRS_scores)): if (CRS > random_CRS_scores[a]): p += 1.0 # add an extra 1 to p, to account for the fact that counting in computers starts at 0 p += 1.0 # divide get_random_geneset_data by 100, effectively making it a p value p = p/100.0 return 1 - p def collect_bpms_that_meet_criteria(suppsInfo, knockoutInfo): outputFileString = 'All BPMs.html' # create a file in the directory that was previously created by the program that stores the list of BPMs outputFile = open(outputFileString, 'w') # that meet the required specifications outputFile.write ('
\n')
highlight = 0
genenames = readRegistryGenenames()
pathway1 = []
pathway2 = []
print 'Total BPMs: ', len(suppsInfo[1])
for i in range (len(suppsInfo[1])): # for every BPM read in either from the user or from file (suppsInfo[1] is the same length as suppsInfo[0])
temp1 = []
temp2 = []
testCounter1 = 0
testCounter2 = 0
for j in range (len(suppsInfo[0][i])): # for every member of a pathway in the first column of the current BPM
try:
testCounter1 += knockoutInfo[0].count (genenames[suppsInfo [0][i][j]]) # record the total number of tests peformed upon any gene in the pathway
except KeyError: # i.e. count the number of genes in the pathway that were knocked out
try:
testCounter1 += knockoutInfo[0].count (suppsInfo [0][i][j])
except KeyError:
counter +=1
misses.append (suppsInfo[0][i][j])
pass
for k in range (len(suppsInfo[1][i])): # for every member of a pathway in the second column of the current BPM
try:
testCounter2 += knockoutInfo[0].count (genenames[suppsInfo[1][i][k]]) # record the total number of tests performed upon any gene in the pathway
except KeyError: # i.e. count the number of genes in the pathway that were knocked out
try:
testCounter2 += knockoutInfo[0].count (suppsInfo[1][i][k])
except KeyError:
counter += 1
misses.append (suppsInfo[1][i][j])
pass
if (testCounter1 > 0 and len (suppsInfo[1][i]) > 2) or (testCounter2 > 0 and len (suppsInfo[0][i]) > 2):
# if pathway 1 contains a gene that was tested and pathway 2 is longer than 2, or if pathway 2 contains a gene that was tested and pathway 1 is longer than 2,
# add both pathway 1 and pathway 2 as a single BPM to the BPM list
outputFile.write ('\nBPM ')
outputFile.write (str(i))
outputFile.write ("(")
outputFile.write (str(highlight))
outputFile.write (")!!
")
if (testCounter1 > 0 and len (suppsInfo[1][i]) > 2) and (testCounter2 <= 0 or len (suppsInfo[0][i]) <= 2):
outputFile.write ('')
outputFile.write (str (suppsInfo[0][i]))
outputFile.write ('
')
outputFile.write (str (suppsInfo[1][i]))
elif (testCounter2 > 0 and len(suppsInfo[0][i]) > 2) and (testCounter1 <= 0 or len (suppsInfo[1][i]) <= 2):
outputFile.write (str(suppsInfo[0][i]))
outputFile.write ('
')
outputFile.write (str (suppsInfo[1][i]))
outputFile.write ('')
else:
outputFile.write ('')
outputFile.write (str (suppsInfo[0][i]))
outputFile.write ('
')
outputFile.write (str (suppsInfo[1][i]))
outputFile.write ('')
highlight += 1
outputFile.write ('
')
pathway1.append (suppsInfo[0][i]) # adding pathway 1 to the list of pathways 1
pathway2.append (suppsInfo[1][i]) # adding pathway 2 to the list of pathways 2 (since there is a one-to-one correspondence between pathway1s and pathway
# 2s, these pathways will appear at the same index in the array of all BPMs)
else:
outputFile.write ("BPM ")
outputFile.write (str(i))
outputFile.write ("!!
")
outputFile.write (str (suppsInfo[0][i]))
outputFile.write ('
')
outputFile.write (str (suppsInfo[1][i]))
outputFile.write ('
')
outputFile.write ('
Current BPM ')
outputFile.write(str(currentBPM))
outputFile.write ("
")
def initialize_genelist(bpmList, j, side, currentBPM):
genelist = []
for i in range (len(bpmList[side][currentBPM])):
genelist.append (bpmList[side][currentBPM][i])
genelist.remove (bpmList[side][currentBPM][j])
return genelist
def get_log_sorted_values(temp0, gseaType):
if gseaType == "1" or gseaType == "2":
sorted = getSortedMicroArray (temp0) # create a sorted array of the log10 values from the hughes column
else:
sorted = getSortedMicroArrayAbs (temp0) # create a sorted array of the log10 values from the hughes column taking the absolute values of the log10s
return sorted
def get_gene_ranks_KO(sorted, bpmList, currentBPM, gseaType, side):
if gseaType == "1" or gseaType == "2":
logs1 = generanksAbs (sorted, bpmList[side][currentBPM]) #generate list of all the gene ranking data from the generanksAbs function for the KO pathway
else:
logs1 = generanksAbs (sorted, bpmList[side][currentBPM]) #generate list of all the gene ranking data from the generanksAbs function for the KO pathway
return logs1
def get_gene_ranks_compensatory(sorted, bpmList, currentBPM, gseaType, side):
if gseaType == "1" or gseaType == "2":
logs2 = generanksAbs (sorted, bpmList[side][currentBPM]) #generate list of all the gene ranking data from the generanksAbs function for the current pathway
else:
logs2 = generanksAbs (sorted, bpmList[side][currentBPM]) #generate list of all the gene ranking data from the generanksAbs function for the current pathway
return logs2
def set_genenames_KO(logs1):
names1 = []
for name in logs1:
names1.append (name[3])
return names1
def set_genenames_compensatory(logs2):
names2 = []
for name2 in logs2:
names2.append (name2[3])
return names2
def write_current_knockout_stats(bpmList, currentBPM, j, outputFile, sorted, side, otherside):
outputFile.write('-----Knockout Gene:') # print the gene knocked out, both its genename and common name
outputFile.write(str(bpmList[side][currentBPM][j])) # the knockout's genename
outputFile.write(' (')
outputFile.write(str(bpmList[side + 2][currentBPM][j])) # the knockout's common name
outputFile.write (')')
outputFile.write('-----
The length of the pathway opposite the knockout: ')
outputFile.write (str(len(bpmList[otherside][currentBPM]))) # print the length of the pathway opposite the knockout
outputFile.write ("
The length of the list of log10 values: ")
outputFile.write (str(len(sorted))) # the length of the list of all log10 values, adjusted depending on the number of NaNs
outputFile.write ("
")
outputFile.write("Pathway 1\t\t\tlog10\t\t# After\t\t# Before\t\t\t\tPathway 2\t\t\tlog10\t\t# After\t\t# Before
\n")
def determine_shorter_pathway(bpmList, currentBPM):
if (len (bpmList[0][currentBPM]) < len (bpmList[1][currentBPM])): # determine which list is shorter
shorter = 0
else:
shorter = 1
return shorter
def determine_longer_pathway(currentBPM, bpmList):
if (len (bpmList[0][currentBPM]) < len (bpmList[1][currentBPM])): # determine which list is shorter
longer = 1
else:
longer = 0
return longer
def write_item_KO(bpmList, currentBPM, j, item, outputFile, a, side):
#CHANGE FROM => if (item == bpmList[side][currentBPM][a] or item == bpmList[side][currentBPM][j]):
if (bpmList[side][currentBPM][a] == bpmList[side][currentBPM][j]):
# if the current item in the pathway 1 is the knockout, underline it (item is always the genename)
outputFile.write ('')
outputFile.write (item)
outputFile.write ('')
else: # otherwise just write the item (item is always the genename)
outputFile.write (item)
def write_item_compensatory(bpmList, currentBPM, a, item, outputFile, side, good):
if (good == "yes"):
outputFile.write("")
outputFile.write (bpmList[side][currentBPM][a]) # print the opposite pathway's (pathway 2) ath gene's genename
outputFile.write ('(')
outputFile.write (bpmList[side + 2][currentBPM][a]) # print the opposite pathway's (pathway 2) ath gene's common name
outputFile.write (')')
def write_items_common_name(bpmList, currentBPM, a, j, outputFile, side):
outputFile.write ('(')
if (bpmList[side][currentBPM][a] == bpmList[side][currentBPM][j]):
# if the ath gene in the shorter pathway is the knockout, print the common name of the
# knocked out gene and underline it
outputFile.write ('')
outputFile.write (bpmList[side][currentBPM][a])
outputFile.write ('')
else: # otherwise just output the common name for the ath gene in the shorter pathway
outputFile.write (bpmList[side][currentBPM][a])
outputFile.write(')')
def write_log_and_rank(temp0, x, currentBPM, bpmList, outputFile, logs1, names1, a, side):
if ((bpmList[side][currentBPM][a] == temp0[x][0]) or (bpmList[side+2][currentBPM][a] == temp0[x][0])):
# if the genename/common name of the ath gene in the pathway (pathway 1) is equal to the name
# stored in the zth list in temp 0[z][0] (temp0[z][0] holds the name of the gene whose information
# is stored in temp0[z][1], temp0[z][2], and temp0[z][3] for the current list z)
outputFile.write ('\t\t\t')
outputFile.write (temp0[x][2] + '\t\t')
# write the information stored in the third column (log10 value) for the current gene
try: # try to print the rank for the curent gene's log10 value among all the log10 values calculated and the # of log10 values after the gene
# for the current knockout (this calculation is done in generanksAbs previously).
outputFile.write (str(logs1[names1.index (temp0[x][0])][0]) + '\t\t' + str(logs1[names1.index (temp0[x][0])][1]))
return logs1[names1.index (temp0[x][0])][0]
except ValueError:
outputFile.write ("N/A\t\tN/A")
return -0.001
else:
return 0
def get_rank(temp0, x, currentBPM, bpmList, outputFile, logs1, names1, a, side):
if ((bpmList[side][currentBPM][a] == temp0[x][0]) or (bpmList[side+2][currentBPM][a] == temp0[x][0])):
try:
return logs1[names1.index (temp0[x][0])][0]
except ValueError:
return 0
else:
return 0
def write_scores(outputFile, pFile, pFileKO, CRS_left, CRS_right, random_genesets_pValLeft, random_genesets_pVal, random_knockouts_pValLeft, random_knockouts_pVal, currentBPM):
outputFile.write ("
ES Random Genesets = ")
outputFile.write (str(CRS_left)) #output the value calculated by the compute_cluster_rank_score method for the KO pathway
outputFile.write ("\t\tES Random KOs = ")
outputFile.write (str(CRS_left)) #output the value calculated by the compute_cluster_rank_score method for the KO pathway
outputFile.write ("\t\tES Random Genesets = ")
outputFile.write (str(CRS_right)) #output the value calculated by the compute_cluster_rank_score method for the opposite pathway
outputFile.write ("\t\t\tES Random KOs = ")
outputFile.write (str(CRS_right)) #output the value calculated by the compute_cluster_rank_score method for the opposite pathway
outputFile.write ("
p Random Genesets = ")
outputFile.write (str(random_genesets_pValLeft)) # output the p value calculated by the get_random_geneset_data method for the KO pathway (see above)
outputFile.write ("\t\t\t\tp Random KOs = ")
outputFile.write (str(random_knockouts_pValLeft)) # output the p value calculated by the get_random_knockout_data method for the current pathway (see above)
outputFile.write ("\t\t\t\tp Random Genesets = ")
outputFile.write (str(random_genesets_pVal)) # output the p value calculated by the get_random_geneset_data method for the KO pathway (see above)
outputFile.write ("\t\t\t\tp Random KOs = ")
outputFile.write (str(random_knockouts_pVal)) # output the p value calculated by the get_random_knockout_data method for the current pathway (see above)
pFile.write (str(random_genesets_pVal)) # output the p value to a separate file for easy histogram creation later
pFile.write ('\t')
pFile.write (str (currentBPM)) #output the BPM of the p value to the same file as the get_random_geneset_datas, so finding validated BPMs can be done easily
pFile.write ('\t1\n')
pFileKO.write(str(random_knockouts_pVal) + '\t' + str(currentBPM) + '\t1\n')
def write_pruned(pruned, pruned_common, temp, outputFile, prunedPFile, prunedPFileKO, i, knockout_pathway_common, knockout_pathway_Y, expression_data, knockout_genes_list):
if (len(pruned) > 2):
CRS_right = compute_rank_cluster_score(temp, pruned)
random_genesets_CRS_scores = get_random_geneset_data(temp, pruned)
random_knockouts_CRS_scores = get_random_knockout_data(expression_data, knockout_pathway_common, knockout_pathway_Y, pruned_common, pruned, knockout_genes_list)
random_genesets_pVal = calculate_p_value(CRS_right, random_genesets_CRS_scores)
random_knockouts_pVal = calculate_p_value(CRS_right, random_knockouts_CRS_scores)
prunedPFile.write(str(random_genesets_pVal) + '\t' + str(i) + '\t1\n')
prunedPFileKO.write(str(random_knockouts_pVal) + '\t' + str(i) + '\t1\n')
else:
CRS_right = 'N/A'
random_genesets_pVal = 'N/A'
random_knockouts_pVal = 'N/A'
outputFile.write('\n\n\t\t\t\t\t\t\t\t\t\t\t\t\tES Pruned Random Genesets =' + str(CRS_right) + 'p Pruned Random Genesets = ' + str(random_genesets_pVal) + '\n')
outputFile.write('\t\t\t\t\t\t\t\t\t\t\t\t\tES Pruned Random KOs =' + str(CRS_right) + '\tp Pruned Random KOs = ' + str(random_knockouts_pVal))
outputFile.write ("
")
outputFile.write("
\n")
def KO_output(genelist, bpmList, j, currentBPM, temp, gseaType, outputFile, pFile, pFileKO, prunedPFile, prunedPFileKO, pruneCut, expression_data, knockout_genes_list):
CRS_right = compute_rank_cluster_score(temp, bpmList[1][currentBPM])
CRS_left = compute_rank_cluster_score(temp, genelist)
random_genesets_CRS_scores = get_random_geneset_data(temp, bpmList[1][currentBPM]) # calculate the get_random_geneset_data from the gsea value of the current pathway
random_knockouts_CRS_scores = get_random_knockout_data(expression_data, bpmList[2][currentBPM], bpmList[0][currentBPM], bpmList[3][currentBPM], bpmList[1][currentBPM], knockout_genes_list)
random_genesets_pVal = calculate_p_value(CRS_right, random_genesets_CRS_scores)
random_knockouts_pVal = calculate_p_value(CRS_right, random_knockouts_CRS_scores)
random_genesets_CRS_scoresLeft = get_random_geneset_data(temp, genelist)
random_knockouts_CRS_scoresLeft = get_random_knockout_data(expression_data, bpmList[2][currentBPM], bpmList[0][currentBPM], bpmList[2][currentBPM], bpmList[0][currentBPM], knockout_genes_list)
random_genesets_pValLeft = calculate_p_value(CRS_left, random_genesets_CRS_scoresLeft)
random_knockouts_pValLeft = calculate_p_value(CRS_left, random_knockouts_CRS_scoresLeft)
sorted = get_log_sorted_values(temp, gseaType)
logs1 = get_gene_ranks_KO(sorted, bpmList, currentBPM, gseaType, 0)
logs2 = get_gene_ranks_compensatory(sorted, bpmList, currentBPM, gseaType, 1)
names1 = set_genenames_KO(logs1) #names1 and names2 will be used to store the names of the genes in logs1 and logs2, stored with the same indices as their corresponding information in logs1 and logs2,
names2 = set_genenames_compensatory(logs2) #so names1 and names2 effectively allow easy access to logs1 and logs2
write_current_knockout_stats(bpmList, currentBPM, j, outputFile, sorted, 0, 1)
shorter = determine_shorter_pathway(bpmList, currentBPM)
longer = determine_longer_pathway(currentBPM, bpmList)
pruned = []
pruned_common = []
a = 0 # a counter that holds the number of iterations through the pathways that have occurred
for item in bpmList[shorter][currentBPM]: # for every gene in the shorter pathway of the current BPM
found = 0 # a boolean that tells whether or not the gene's log10 data has been found in temp during write_log_and_ranks
good = "not yet"
if shorter == 0: # always print the side with the knockout on the left, so if the side with the
# knockout is shorter, then output current item
write_item_KO(bpmList, currentBPM, j, item, outputFile, a, 0) #BIG CHANGE => both a's used to be j's
write_items_common_name(bpmList, currentBPM, a, j, outputFile, 2)
# CHANGE => outputFile.write ('\t\t\t\t\t')
found = 0
for x in range (len (temp)): # for the size of the hughes column
if (found != 0):
x = len(temp) + 1
else:
found = write_log_and_rank(temp, x, currentBPM, bpmList, outputFile, logs1, names1, a, 0)
found = 0
outputFile.write ('\t\t\t\t\t')
for z in range (len (temp)): # for the size of the hughes column
rank = get_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 1)
if rank != 0:
break
if (float(float(rank)/float(len(temp))) > pruneCut):
good = "yes"
pruned.append(bpmList[longer][currentBPM][a])
pruned_common.append(bpmList[longer + 2][currentBPM][a])
else:
good = "no"
write_item_compensatory(bpmList, currentBPM, a, item, outputFile, 1, good)
write_log_and_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 1)
if good == "yes":
outputFile.write("")
outputFile.write ('
')
found = 0
a+=1 # for each iteration, increment a
else: # if the shorter list isn't 0, but is in fact 1
write_item_KO(bpmList, currentBPM, j, bpmList[longer][currentBPM][a], outputFile, a, 0)
write_items_common_name(bpmList, currentBPM, a, j, outputFile, 2)
found = 0
for x in range (len (temp)): # for the size of the hughes column
if (found != 0):
x = len(temp) + 1
else:
found = write_log_and_rank(temp, x, currentBPM, bpmList, outputFile, logs1, names1, a, 0)
outputFile.write ('\t\t\t\t\t')
found = 0
for z in range (len (temp)): # for the size of the hughes column
rank = get_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 1)
if rank != 0:
break
if (float(float(rank)/float(len(temp))) > pruneCut):
good = "yes"
pruned.append(bpmList[shorter][currentBPM][a])
pruned_common.append(bpmList[shorter + 2][currentBPM][a])
else:
good = "no"
write_item_compensatory(bpmList, currentBPM, a, item, outputFile, 1, good)
found = write_log_and_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 1)
if good == "yes":
outputFile.write("")
outputFile.write ('
')
found = 0
a+=1 # increment a
# this while loop performs the same operations as above for the longer list, because the above
# loop finished the smaller list
while a < len(bpmList[longer][currentBPM]):
found = 0
good = "not yet"
if longer == 1: # if longer is 1, then it doesn't contain the knockout, so is on the right,
# so it must be moved to align with the pathway 2 information that was written above
outputFile.write ('\t\t\t\t\t\t\t\t\t\t\t\t\t')
found = 0
for z in range (len (temp)): # for the size of the hughes column
rank = get_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 1)
if rank != 0:
break
if (float(float(rank)/float(len(temp))) > pruneCut):
good = "yes"
pruned.append(bpmList[longer][currentBPM][a])
pruned_common.append(bpmList[longer + 2][currentBPM][a])
else:
good = "no"
write_item_compensatory(bpmList, currentBPM, a, bpmList[longer][currentBPM][a], outputFile, 1, good) #BIG CHANGE => changed item to bpmList[longer][currentBPM][a]
for z in range (len (temp)): # for the size of the hughes column
if (found != 0):
z = len(temp) + 1
else:
found = write_log_and_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 1) #CHANGE => 1 to 2
if good == "yes":
outputFile.write("")
found = 0
outputFile.write ('
')
a+=1 # increment a
else: # if longer is shorter, and therefore contains the knockout
write_item_KO(bpmList, currentBPM, j, bpmList[longer][currentBPM][a], outputFile, a, 0)
write_items_common_name(bpmList, currentBPM, a, j, outputFile, 2)
found = 0
for x in range (len (temp)): # for the size of the hughes column
if (found != 0):
x = len(temp) + 1
else:
found = write_log_and_rank(temp, x, currentBPM, bpmList, outputFile, logs1, names1, a, 0)
found = 0
outputFile.write ('
')
a+=1
write_scores(outputFile, pFile, pFileKO, CRS_left, CRS_right, random_genesets_pValLeft, random_genesets_pVal, random_knockouts_pValLeft, random_knockouts_pVal, currentBPM)
write_pruned(pruned, pruned_common, temp, outputFile, prunedPFile, prunedPFileKO, currentBPM, bpmList[2][currentBPM], bpmList[0][currentBPM], expression_data, knockout_genes_list)
def compensatory_output(genelist, bpmList, k, currentBPM, temp, gseaType, outputFile, pFile, pFileKO, prunedPFile, prunedPFileKO, pruneCut, expression_data, knockout_genes_list):
CRS_right = compute_rank_cluster_score(temp, bpmList[0][currentBPM])
CRS_left = compute_rank_cluster_score(temp, genelist)
random_genesets_CRS_scores = get_random_geneset_data(temp, bpmList[0][currentBPM]) # calculate the get_random_geneset_data from the gsea value of the current pathway
random_knockouts_CRS_scores = get_random_knockout_data(expression_data, bpmList[3][currentBPM], bpmList[1][currentBPM], bpmList[2][currentBPM], bpmList[0][currentBPM], knockout_genes_list)
random_genesets_pVal = calculate_p_value(CRS_right, random_genesets_CRS_scores)
random_knockouts_pVal = calculate_p_value(CRS_right, random_knockouts_CRS_scores)
random_genesets_CRS_scoresLeft = get_random_geneset_data(temp, genelist)
random_knockouts_CRS_scoresLeft = get_random_knockout_data(expression_data, bpmList[3][currentBPM], bpmList[1][currentBPM], bpmList[3][currentBPM], bpmList[1][currentBPM], knockout_genes_list)
random_genesets_pValLeft = calculate_p_value(CRS_left, random_genesets_CRS_scores)
random_knockouts_pValLeft = calculate_p_value(CRS_left, random_knockouts_CRS_scores)
sorted = get_log_sorted_values(temp, gseaType)
logs1 = get_gene_ranks_KO(sorted, bpmList, currentBPM, gseaType, 1)
logs2 = get_gene_ranks_compensatory(sorted, bpmList, currentBPM, gseaType, 0)
names1 = set_genenames_KO(logs1) #names1 and names2 will be used to store the names of the genes in logs1 and logs2, stored with the same indices as their corresponding information in logs1 and logs2,
names2 = set_genenames_compensatory(logs2) #so names1 and names2 effectively allow easy access to logs1 and logs2
write_current_knockout_stats(bpmList, currentBPM, k, outputFile, sorted, 1, 0)
shorter = determine_shorter_pathway(bpmList, currentBPM)
longer = determine_longer_pathway(currentBPM, bpmList)
pruned = []
pruned_common = []
a = 0
for item in bpmList[shorter][currentBPM]:
found = 0
good = "not yet"
if shorter == 1:
write_item_KO(bpmList, currentBPM, k, bpmList[1][currentBPM][a], outputFile, a, 1) #BIG CHANGE => a used to be k, bpmList[1][currentBPM][a] used to be item
write_items_common_name(bpmList, currentBPM, a, k, outputFile, 3)
found = 0
for x in range (len (temp)):
if (found == 1):
x = len(temp) + 1
else:
found = write_log_and_rank(temp, x, currentBPM, bpmList, outputFile, logs1, names1, a, 1)
found = 0
outputFile.write ('\t\t\t\t\t')
for z in range (len (temp)): # for the size of the hughes column
rank = get_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 0)
if rank != 0:
break
if (float(float(rank)/float(len(temp))) > pruneCut):
good = "yes"
pruned.append(bpmList[longer][currentBPM][a])
pruned_common.append(bpmList[longer + 2][currentBPM][a])
else:
good = "no"
write_item_compensatory(bpmList, currentBPM, a, item, outputFile, 0, good)
write_log_and_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 0)
if good == "yes":
outputFile.write("")
found = 0
outputFile.write ('
')
a+=1
else:
write_item_KO(bpmList, currentBPM, k, bpmList[1][currentBPM][a], outputFile, a, 1)#BIG CHANGE => a used to be k
write_items_common_name(bpmList, currentBPM, a, k, outputFile, 3)
found = 0
for x in range (len (temp)):
if (found == 1):
x = len(temp) + 1
else:
write_log_and_rank(temp, x, currentBPM, bpmList, outputFile, logs1, names1, a, 1)
found = 0
outputFile.write ('\t\t\t\t\t')
for z in range (len (temp)): # for the size of the hughes column
rank = get_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 0)
if rank != 0:
break
if (float(float(rank)/float(len(temp))) > pruneCut):
good = "yes"
pruned.append(bpmList[shorter][currentBPM][a])
pruned_common.append(bpmList[shorter + 2][currentBPM][a])
else:
good = "no"
write_item_compensatory(bpmList, currentBPM, a, item, outputFile, 0, good)
found = write_log_and_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 0)
if good == "yes":
outputFile.write("")
found = 0
outputFile.write ('
')
a+=1
for a in range (a, len(bpmList[longer][currentBPM])): #BIG CHANGE => changed from range(len(bpmList[longer][currentBPM]))
found = 0
good = "not yet"
if longer == 0:
outputFile.write ('\t\t\t\t\t\t\t\t\t\t\t\t\t')
for z in range (len (temp)): # for the size of the hughes column
rank = get_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 0)
if rank != 0:
break
if (float(float(rank)/float(len(temp))) > pruneCut):
good = "yes"
pruned.append(bpmList[longer][currentBPM][a])
pruned_common.append(bpmList[longer + 2][currentBPM][a])
else:
good = "no"
write_item_compensatory(bpmList, currentBPM, a, bpmList[longer][currentBPM][a], outputFile, 0, good) #BIG CHANGE => bpmList[longer][currentBPM][a] used to be item
found = 0
found = write_log_and_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 0)
if good == "yes":
outputFile.write("")
found = 0
outputFile.write ('
')
a+=1
else:
write_item_KO(bpmList, currentBPM, k, bpmList[longer][currentBPM][a], outputFile, a, 1) #BIG CHANGE => a used to be k
write_items_common_name(bpmList, currentBPM, a, k, outputFile, 3)
found = 0
for x in range (len (temp)):
if (found == 1):
x = len(temp) + 1
else:
found = write_log_and_rank(temp, x, currentBPM, bpmList, outputFile, logs1, names1, a, 1)
found = 0
outputFile.write ('
')
a+=1
write_scores(outputFile, pFile, pFileKO, CRS_left, CRS_right, random_genesets_pValLeft, random_genesets_pVal, random_knockouts_pValLeft, random_knockouts_pVal, currentBPM)
write_pruned(pruned, pruned_common, temp, outputFile, prunedPFile, prunedPFileKO, currentBPM, bpmList[3][currentBPM], bpmList[1][currentBPM], expression_data, knockout_genes_list)
def run():
response = determine_file_path()
expression_file_string = raw_input('\nPlease enter the file path to the expression data you would like to use.\n')
if (expression_file_string == "hughes"):
expression_file_string = "./Expression Data/data_expts1-300_meas.txt"
expression_data = open(expression_file_string, 'r')
gseaType = '3'
pruneCut = raw_input("\nWhat would you like to set the pruning cutoff to?\n")
file_strings = get_output_file_strings(gseaType)
knock = getKnockouts(expression_data)
knockout_genes_list = knock[3]
bpmList = create_bpm_list(response, knock)
# the bpmList contains all the information gathered above in one list, where columns [0] & [1] contain
# the genenames of pathways 1 and 2 for all BPMs, columns [2] & [3] contain the common names of pathways
# 1 and 2 for all BPMs, columns [4] & [5] contain the test locations of all the genes in pathways 1 and
# 2 of each BPM that are denoted by their genename, and columns [6] & [7] contain the test locations of
# all the genes in pathways 1 and 2 of each BPM that are denoted by their common name
ourList = ourGeneList(bpmList)
theirList = theirListLocations(ourList, knock[0])
pFile = open(file_strings[0], 'w')
outputFile = open(file_strings[1], 'w')
prunedPFile = open (file_strings[2], 'w')
pFileKO = open(file_strings[3], 'w')
prunedPFileKO = open(file_strings[4], 'w')
write_outputfile_header(outputFile, pruneCut, response)
print 'Beginning...'
for i in range(0, len(bpmList[0])): # for every BPM
print
print '\tBPM ', str(i), '...'
print
currentBPM = i
write_current_bpm_header(currentBPM, outputFile)
for j in range(0, len(bpmList[0][currentBPM])): # for every gene in pathway 1 of the current BPM
print '\t\tLooking at gene ', bpmList[0][currentBPM][j], ' for possible KO output'
if ((bpmList[4][currentBPM][j] != -1) and (len(bpmList[1][currentBPM]) > 2)):
# if there is a test for the current gene, and the length of the pathway on the opposite side is greater than 2
genelist = initialize_genelist(bpmList, j, 0, currentBPM)
temp0 = getHughesColumn(expression_data, bpmList[4][currentBPM][j]) # get the corresponding column from the hughes file
print '\t\tUsing gene ', bpmList[0][currentBPM][j], ' as the KO'
KO_output(genelist, bpmList, j, currentBPM, temp0, gseaType, outputFile, pFile, pFileKO, prunedPFile, prunedPFileKO, float(pruneCut), expression_data, knockout_genes_list)
elif ((bpmList[6][currentBPM][j] != -1) and (len(bpmList[1][currentBPM]) > 2)):
# if there was no test denoted by a genename, check if there is a test denoted by a common name for the
# current gene in pathway 1 of the current BPM, and make sure the opposite pathway has length > 2
genelist = initialize_genelist(bpmList, j, 0, currentBPM)
temp1 = getHughesColumn(expression_data, bpmList[6][currentBPM][j])
print '\t\tUsing gene ', bpmList[0][currentBPM][j], ' as the KO'
KO_output(genelist, bpmList, j, currentBPM, temp1, gseaType, outputFile, pFile, pFileKO, prunedPFile, prunedPFileKO, float(pruneCut), expression_data, knockout_genes_list)
print '\t\tFinished First Pathway...'
print
for k in range(0, len(bpmList[1][currentBPM])): # for every gene in pathway 2 in the current BPM
print '\t\tLooking at gene ', bpmList[1][currentBPM][k], ' for possible compensatory output'
if ((bpmList[5][currentBPM][k] != -1) and (len(bpmList[0][currentBPM]) > 2)):
# if there is a test denoted by genename for the current gene and the opposite pathway has length > 2
genelist = initialize_genelist(bpmList, k, 1, currentBPM)
temp2 = getHughesColumn(expression_data, bpmList[5][currentBPM][k])
print '\t\tUsing gene ', bpmList[1][currentBPM][k], ' as the KO'
compensatory_output(genelist, bpmList, k, currentBPM, temp2, gseaType, outputFile, pFile, pFileKO, prunedPFile, prunedPFileKO, float(pruneCut), expression_data, knockout_genes_list)
elif ((bpmList[7][currentBPM][k] != -1) and (len(bpmList[0][currentBPM]) > 2)):
# if the wasn't a test denoted by genename for the current gene but there is one denoted by common name
# and the length of the opposite pathway is greater than 2
genelist = initialize_genelist(bpmList, k, 1, currentBPM)
temp3 = getHughesColumn(expression_data, bpmList[7][currentBPM][k])
print '\t\tUsing gene ', bpmList[1][currentBPM][k], ' as the KO'
compensatory_output(genelist, bpmList, k, currentBPM, temp3, gseaType, outputFile, pFile, pFileKO, prunedPFile, prunedPFileKO, float(pruneCut), expression_data, knockout_genes_list)
print '\t\tFinished Second Pathway...'
outputFile.write ('